%% empirical_wagner.m - Empirical Application: New Hampshire Three-Drug Cap Policy
%
% Replicates Table 3 in:
%   Lin, S.-Y., Tsay, W.-J., and Wong, H.-C. (2026), "Robust Inference for
%   Interrupted Time Series Models with Serial Dependent Disturbances."
%
% This script applies the proposed rescaled t-statistics and the studentized
% sieve bootstrap to the New Hampshire Medicaid "three-drug cap" policy data
% originally studied by:
%
%   Wagner, A.K., Soumerai, S.B., Zhang, F., and Ross-Degnan, D. (2002),
%   "Segmented regression analysis of interrupted time series studies in
%   medication use research," Journal of Clinical Pharmacy and Therapeutics
%   27, 299-309.
%
% NOTE ON DATA:
%   The data file 'wagner_data_2002.txt' is NOT included in this replication
%   package because it belongs to the original authors. Interested readers
%   should refer to Wagner et al. (2002) or contact the original authors for
%   data access. Once the file is available, place it in the 'Data/' folder
%   relative to this script and re-run.
%
% DEPENDENCIES:
%   compute_ols_inference.m, choose_best_ar_model.m,
%   generate_arma_process_pq.m, kernelEstimator.m
%
% SETTINGS (matching Table 3 in the paper):
%   T  = 30 (31 original months, excluding month 20 - anticipatory spike)
%   T1 = 19 (intervention occurs at re-indexed t = 20)
%   xi = 19/30
%   Bandwidth: S_T = floor(0.75 * T^(1/3))   [Andrews 1991]
%   Bootstrap: B = 100,000 replications (AIC and BIC lag selection)

clear all
close all
clc

rng(8964);  % Fix seed for reproducibility

%% =========================================================================
%  STEP 1: LOAD DATA
%  =========================================================================
%  The data file has the following columns:
%    Col 1: original observation index (1-48)
%    Col 2: y (mean number of prescriptions per patient per month)
%    Col 3: original time index t
%    Col 4: d_t (post-intervention indicator)
%    Col 5: (t - T1) * d_t (post-intervention time trend)
%    Col 6-7: additional columns (not used in this analysis)

data_file = fullfile('..', 'Data', 'wagner_data_2002.txt');

if ~exist(data_file, 'file')
    error(['Data file not found: %s\n\n' ...
           'The Wagner et al. (2002) data are not included in this\n' ...
           'replication package. Please refer to the original paper:\n\n' ...
           '  Wagner et al. (2002), Journal of Clinical Pharmacy\n' ...
           '  and Therapeutics 27, 299-309.\n\n' ...
           'Once obtained, place the file at: %s'], data_file, data_file);
end

raw = load(data_file);

% Use observations 1-31, excluding row 20 (anticipatory demand spike
% immediately before the three-drug cap; see Wagner et al. 2002 and
% Section 5 of the paper).
rows_to_use = [1:19, 21:31];
y = raw(rows_to_use, 2);   % outcome: mean prescriptions per patient

%% =========================================================================
%  STEP 2: DESIGN MATRIX
%  =========================================================================
T  = length(y);     % 30 observations after exclusion
T1 = 19;            % pre-intervention period length
xi = T1 / T;        % pre-intervention fraction (= 19/30)

t_index    = (1:T)';
d_t        = [zeros(T1, 1);     ones(T - T1, 1)  ];   % post-intervention indicator
trend_post = [zeros(T1, 1);     (1:(T - T1))'    ];   % (t - T1)*d_t

X = [ones(T, 1), t_index, d_t, trend_post];

fprintf('============================================================\n');
fprintf('Empirical Application: New Hampshire Three-Drug Cap Policy\n');
fprintf('============================================================\n');
fprintf('Total observations (T)       : %d\n', T);
fprintf('Pre-intervention length (T1) : %d\n', T1);
fprintf('Pre-intervention fraction xi : %.4f\n', xi);

%% =========================================================================
%  STEP 3: BANDWIDTH
%  =========================================================================
% Andrews (1991) rule-of-thumb for the Bartlett kernel, AR(1) coefficient 0.25.
% In the paper: S_T = floor(0.75 * T^(1/3)).
% In compute_ols_inference the bandwidth argument is S_T + 1.
S_T          = floor(0.75 * T^(1/3));
nw_lag_trunc = S_T + 1;

fprintf('Bandwidth (S_T)              : %d\n', S_T);

%% =========================================================================
%  STEP 4: OLS + RESCALED t-STATISTICS  (Eqs. 18-21 of the paper)
%  =========================================================================
null_beta = zeros(4, 1);   % H0: beta_i = 0 for all i

[~, est_ols, residuals, test_ols, ~] = compute_ols_inference(...
    y, X, xi, nw_lag_trunc, null_beta, true);

param_labels = {'beta_0 (Intercept)       ', ...
                'beta_1 (Baseline trend)  ', ...
                'beta_2 (Level change)    ', ...
                'beta_3 (Trend change)    '};

fprintf('\n--- OLS Estimates and Rescaled t-Statistics ---\n');
fprintf('%-30s %12s %12s\n', 'Parameter', 'Estimate', 'Rescaled t');
fprintf('%s\n', repmat('-', 1, 56));
for i = 1:4
    fprintf('%-30s %12.4f %12.4f\n', param_labels{i}, est_ols(i), test_ols(i));
end

%% =========================================================================
%  STEP 5: STUDENTIZED SIEVE BOOTSTRAP  (Section 3.1 of the paper)
%  =========================================================================
p_max         = 8;         % maximum AR order for sieve selection
num_bootstrap = 100000;    % bootstrap replications (matches Table 3)
alpha         = 0.05;      % nominal significance level
p_quantile    = 1 - alpha / 2;   % = 0.975
burnin        = 10000;     % burn-in for bootstrap error generation

fprintf('\n--- Studentized Sieve Bootstrap (B = %d) ---\n', num_bootstrap);

% Step 1-2: Fit AR(p) to OLS residuals; select order by AIC and BIC
[best_p_aic, best_model_aic, best_p_bic, best_model_bic, ~, ~] = ...
    choose_best_ar_model(residuals, p_max);

fprintf('AR order selected: AIC = %d,  BIC = %d\n', best_p_aic, best_p_bic);

% Step 3: Center the AR innovations (divide by T - p, not T)
AR_resid_aic = infer(best_model_aic, residuals);
AR_resid_bic = infer(best_model_bic, residuals);
AR_resid_aic = AR_resid_aic - sum(AR_resid_aic) / (T - best_p_aic);
AR_resid_bic = AR_resid_bic - sum(AR_resid_bic) / (T - best_p_bic);

% Steps 4-5: Generate bootstrap error series by resampling innovations
[~, eps_star_aic] = generate_arma_process_pq(T, burnin, num_bootstrap, ...
    cell2mat(best_model_aic.AR), [], AR_resid_aic);
[~, eps_star_bic] = generate_arma_process_pq(T, burnin, num_bootstrap, ...
    cell2mat(best_model_bic.AR), [], AR_resid_bic);

% Step 6: Bootstrap pseudo-samples (impose null at OLS estimates)
y_hat      = X * est_ols;
y_star_aic = y_hat + eps_star_aic;   % T x num_bootstrap
y_star_bic = y_hat + eps_star_bic;

% Steps 7-8: Compute bootstrap t-statistics
fprintf('Computing bootstrap t-statistics...\n');
boot_t_aic = zeros(4, num_bootstrap);
boot_t_bic = zeros(4, num_bootstrap);

for b = 1:num_bootstrap
    [~, ~, ~, boot_t_aic(:, b), ~] = compute_ols_inference(...
        y_star_aic(:, b), X, xi, nw_lag_trunc, est_ols, true);
    [~, ~, ~, boot_t_bic(:, b), ~] = compute_ols_inference(...
        y_star_bic(:, b), X, xi, nw_lag_trunc, est_ols, true);
end

% Step 9: Bootstrap critical value bounds at 95% level
idx_lo = ceil((1 - p_quantile) * num_bootstrap);
idx_hi = ceil(p_quantile * num_bootstrap);

cv_aic = sort(boot_t_aic, 2);
cv_bic = sort(boot_t_bic, 2);
cv_aic = cv_aic(:, [idx_lo, idx_hi]);
cv_bic = cv_bic(:, [idx_lo, idx_hi]);

%% =========================================================================
%  STEP 6: DISPLAY TABLE 3
%  =========================================================================
fprintf('\n============================================================\n');
fprintf('Table 3: Parameter Estimates, Rescaled t-Statistics, and\n');
fprintf('         95%% Bootstrapped Critical Value Bounds\n');
fprintf('============================================================\n');
fprintf('%-28s %10s %11s   %16s   %16s\n', ...
    'Parameter', 'Estimate', 'Rescaled t', ...
    '95% Boot CV (AIC)', '95% Boot CV (BIC)');
fprintf('%s\n', repmat('-', 1, 88));
for i = 1:4
    fprintf('%-28s %10.4f %11.4f   [%7.4f, %7.4f]   [%7.4f, %7.4f]\n', ...
        param_labels{i}, est_ols(i), test_ols(i), ...
        cv_aic(i,1), cv_aic(i,2), ...
        cv_bic(i,1), cv_bic(i,2));
end
fprintf('%s\n', repmat('-', 1, 88));
fprintf('Notes: T = %d, T1 = %d, xi = %.4f. Bandwidth S_T = %d.\n', ...
    T, T1, xi, S_T);
fprintf('Bootstrap: B = %d replications; AIC/BIC lag selection.\n', num_bootstrap);
